import csv
import time

import requests
from lxml import etree

# url='http://search.dangdang.com/?key=%CA%FD%BE%DD%B7%D6%CE%F6&act=input&page_index=1'
# Request headers (desktop browser User-Agent)
# Page counter used only for progress reporting in the main loop.
page = 1
# Desktop-browser User-Agent so the site serves the normal HTML results page.
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# Output CSV; newline='' prevents blank rows on Windows, utf-8 handles Chinese text.
fq = open('./dang_book.csv','w+',newline='',encoding='utf-8')
writer=csv.writer(fq)
# Header row: title, price, original price, author, publish date, publisher, comment count, summary.
writer.writerow(['书名','价格','原价','作者','出版日期','出版社','评论数','简介'])
def get_info(url):
    """Scrape one Dangdang search-result page and append one CSV row per book.

    Parameters
    ----------
    url : str
        URL of a single search-result page.

    Writes each book as a row to the module-level ``writer``; returns None.
    """
    def _first(values, default='空'):
        # xpath() always returns a list; take the first hit or a placeholder.
        return values[0] if values else default

    rqq = requests.get(url, headers=headers)
    rqq.encoding = 'GB2312'  # the site serves GB2312-encoded pages
    select = etree.HTML(rqq.text, etree.HTMLParser())
    book_infos = select.xpath('//li[starts-with(@class,"line")]')
    for book_info in book_infos:
        name = _first(book_info.xpath('./p[1]/a/@title'))         # book title
        price = _first(book_info.xpath('./p[3]/span[1]/text()'))  # current price
        cost = _first(book_info.xpath('./p[3]/span[2]/text()'))   # original price
        # Author / date / publisher can appear under p[4], p[5] or p[6]
        # depending on the result-item layout, so try each candidate in turn
        # (``or`` falls through on empty xpath result lists).
        author = _first(
            book_info.xpath('./p[5]/span[1]/a/@title')
            or book_info.xpath('./p[6]/span[1]/a/@title')
            or book_info.xpath('./p[4]/span[1]/a/@title')
        )
        data = _first(
            book_info.xpath('./p[5]/span[2]/text()')
            or book_info.xpath('./p[6]/span[2]/text()')
        )
        # BUG FIX: the original assigned the fallback to ``data`` instead of
        # ``press``, leaving ``press`` undefined (NameError on the first book
        # without a publisher) or stale from the previous iteration.
        press = _first(
            book_info.xpath('./p[5]/span[3]/a/text()')
            or book_info.xpath('./p[6]/span[3]/a/text()')
        )
        comment = _first(book_info.xpath('./p[4]/a/text()'), default='')  # comment count
        detail = _first(book_info.xpath('./p[2]/text()'), default='')     # summary
        # Consistency fix: the original wrote raw xpath lists for several
        # columns (CSV cells looked like "['...']"); all cells are now scalars.
        writer.writerow((name, price, cost, author, data, press, comment, detail))

if __name__=='__main__':
    # Dangdang search results for the keyword "数据分析" (GB2312 URL-encoded),
    # pages 1 through 100.
    urls = ['http://search.dangdang.com/?key=%CA%FD%BE%DD%B7%D6%CE%F6&act=input&page_index={}'.format(i)
            for i in range(1, 101)]
    try:
        # enumerate replaces the hand-maintained global ``page`` counter.
        for page, url in enumerate(urls, start=1):
            get_info(url)
            time.sleep(2)  # throttle requests to be polite to the server
            print('已经爬取了第'+str(page)+'页')
    finally:
        # ROBUSTNESS FIX: the original skipped fq.close() if any request
        # raised mid-loop; finally guarantees the CSV is flushed and closed.
        fq.close()






